Loading Packages

library(data.table) # Efficient Dataframe 
library(lubridate) # For Dates 
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree

Dataset Preparation

Loading the dataset called “ToyotaCorolla.csv”

# Read the dataset with data.table's fast fread()
ToyotaDT <- fread(file.path("DATA", "ToyotaCorolla.csv"))

Quick Preview

# Show the first two rows as an interactive table
DT::datatable(ToyotaDT[1:2, ])

Dataset Description: The file ToyotaCorolla.csv contains data on used cars on sale during late summer of 2004 in the Netherlands. It has 1436 records containing detail on 38 attributes, including Price, Age, Kilometers, HP (Horse Power), and other specifications.

# Variable-by-variable summary of the dataset (summarytools::dfSummary).
# plain.ascii = FALSE and style = "grid" render correctly inside an R Markdown
# HTML document; tmp.img.dir holds the temporary sparkline graph images.
dfSummary(ToyotaDT, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

ToyotaDT

Dimensions: 1436 x 39
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 Id
[integer]
Mean (sd) : 721.6 (416.5)
min < med < max:
1 < 721.5 < 1442
IQR (CV) : 719.5 (0.6)
1436 distinct values 0
(0.0%)
2 Model
[character]
1. TOYOTA Corolla 1.6 16V HA
2. TOYOTA Corolla 1.3 16V HA
3. TOYOTA Corolla 1.6 16V LI
4. TOYOTA Corolla 1.6 16V LI
5. TOYOTA Corolla 1.6 16V SE
6. TOYOTA Corolla 1.4 16V VV
7. TOYOTA Corolla 1.3 16V LI
8. TOYOTA Corolla 1.6 16V VV
9. TOYOTA Corolla 1.6 16V WA
10. TOYOTA Corolla 1.6 16V VV
[ 362 others ]
107 ( 7.5%)
83 ( 5.8%)
79 ( 5.5%)
70 ( 4.9%)
43 ( 3.0%)
42 ( 2.9%)
35 ( 2.4%)
31 ( 2.2%)
28 ( 1.9%)
27 ( 1.9%)
891 (62.0%)
0
(0.0%)
3 Price
[integer]
Mean (sd) : 10730.8 (3627)
min < med < max:
4350 < 9900 < 32500
IQR (CV) : 3500 (0.3)
236 distinct values 0
(0.0%)
4 Age_08_04
[integer]
Mean (sd) : 55.9 (18.6)
min < med < max:
1 < 61 < 80
IQR (CV) : 26 (0.3)
77 distinct values 0
(0.0%)
5 Mfg_Month
[integer]
Mean (sd) : 5.5 (3.4)
min < med < max:
1 < 5 < 12
IQR (CV) : 5 (0.6)
12 distinct values 0
(0.0%)
6 Mfg_Year
[integer]
Mean (sd) : 1999.6 (1.5)
min < med < max:
1998 < 1999 < 2004
IQR (CV) : 3 (0)
1998 : 392 (27.3%)
1999 : 441 (30.7%)
2000 : 225 (15.7%)
2001 : 192 (13.4%)
2002 : 87 ( 6.1%)
2003 : 75 ( 5.2%)
2004 : 24 ( 1.7%)
0
(0.0%)
7 KM
[integer]
Mean (sd) : 68533.3 (37506.4)
min < med < max:
1 < 63389.5 < 243000
IQR (CV) : 44020.8 (0.5)
1263 distinct values 0
(0.0%)
8 Fuel_Type
[character]
1. CNG
2. Diesel
3. Petrol
17 ( 1.2%)
155 (10.8%)
1264 (88.0%)
0
(0.0%)
9 HP
[integer]
Mean (sd) : 101.5 (15)
min < med < max:
69 < 110 < 192
IQR (CV) : 20 (0.1)
12 distinct values 0
(0.0%)
10 Met_Color
[integer]
Min : 0
Mean : 0.7
Max : 1
0 : 467 (32.5%)
1 : 969 (67.5%)
0
(0.0%)
11 Color
[character]
1. Beige
2. Black
3. Blue
4. Green
5. Grey
6. Red
7. Silver
8. Violet
9. White
10. Yellow
3 ( 0.2%)
191 (13.3%)
283 (19.7%)
220 (15.3%)
301 (21.0%)
278 (19.4%)
122 ( 8.5%)
4 ( 0.3%)
31 ( 2.2%)
3 ( 0.2%)
0
(0.0%)
12 Automatic
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1356 (94.4%)
1 : 80 ( 5.6%)
0
(0.0%)
13 CC
[integer]
Mean (sd) : 1576.9 (424.4)
min < med < max:
1300 < 1600 < 16000
IQR (CV) : 200 (0.3)
13 distinct values 0
(0.0%)
14 Doors
[integer]
Mean (sd) : 4 (1)
min < med < max:
2 < 4 < 5
IQR (CV) : 2 (0.2)
2 : 2 ( 0.1%)
3 : 622 (43.3%)
4 : 138 ( 9.6%)
5 : 674 (46.9%)
0
(0.0%)
15 Cylinders
[integer]
1 distinct value 4 : 1436 (100.0%) 0
(0.0%)
16 Gears
[integer]
Mean (sd) : 5 (0.2)
min < med < max:
3 < 5 < 6
IQR (CV) : 0 (0)
3 : 2 ( 0.1%)
4 : 1 ( 0.1%)
5 : 1390 (96.8%)
6 : 43 ( 3.0%)
0
(0.0%)
17 Quarterly_Tax
[integer]
Mean (sd) : 87.1 (41.1)
min < med < max:
19 < 85 < 283
IQR (CV) : 16 (0.5)
13 distinct values 0
(0.0%)
18 Weight
[integer]
Mean (sd) : 1072.5 (52.6)
min < med < max:
1000 < 1070 < 1615
IQR (CV) : 45 (0)
59 distinct values 0
(0.0%)
19 Mfr_Guarantee
[integer]
Min : 0
Mean : 0.4
Max : 1
0 : 848 (59.1%)
1 : 588 (40.9%)
0
(0.0%)
20 BOVAG_Guarantee
[integer]
Min : 0
Mean : 0.9
Max : 1
0 : 150 (10.4%)
1 : 1286 (89.6%)
0
(0.0%)
21 Guarantee_Period
[integer]
Mean (sd) : 3.8 (3)
min < med < max:
3 < 3 < 36
IQR (CV) : 0 (0.8)
3 : 1274 (88.7%)
6 : 77 ( 5.4%)
12 : 73 ( 5.1%)
13 : 1 ( 0.1%)
18 : 1 ( 0.1%)
20 : 1 ( 0.1%)
24 : 4 ( 0.3%)
28 : 1 ( 0.1%)
36 : 4 ( 0.3%)
0
(0.0%)
22 ABS
[integer]
Min : 0
Mean : 0.8
Max : 1
0 : 268 (18.7%)
1 : 1168 (81.3%)
0
(0.0%)
23 Airbag_1
[integer]
Min : 0
Mean : 1
Max : 1
0 : 42 ( 2.9%)
1 : 1394 (97.1%)
0
(0.0%)
24 Airbag_2
[integer]
Min : 0
Mean : 0.7
Max : 1
0 : 398 (27.7%)
1 : 1038 (72.3%)
0
(0.0%)
25 Airco
[integer]
Min : 0
Mean : 0.5
Max : 1
0 : 706 (49.2%)
1 : 730 (50.8%)
0
(0.0%)
26 Automatic_airco
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1355 (94.4%)
1 : 81 ( 5.6%)
0
(0.0%)
27 Boardcomputer
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1013 (70.5%)
1 : 423 (29.5%)
0
(0.0%)
28 CD_Player
[integer]
Min : 0
Mean : 0.2
Max : 1
0 : 1122 (78.1%)
1 : 314 (21.9%)
0
(0.0%)
29 Central_Lock
[integer]
Min : 0
Mean : 0.6
Max : 1
0 : 603 (42.0%)
1 : 833 (58.0%)
0
(0.0%)
30 Powered_Windows
[integer]
Min : 0
Mean : 0.6
Max : 1
0 : 629 (43.8%)
1 : 807 (56.2%)
0
(0.0%)
31 Power_Steering
[integer]
Min : 0
Mean : 1
Max : 1
0 : 32 ( 2.2%)
1 : 1404 (97.8%)
0
(0.0%)
32 Radio
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1226 (85.4%)
1 : 210 (14.6%)
0
(0.0%)
33 Mistlamps
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1067 (74.3%)
1 : 369 (25.7%)
0
(0.0%)
34 Sport_Model
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1005 (70.0%)
1 : 431 (30.0%)
0
(0.0%)
35 Backseat_Divider
[integer]
Min : 0
Mean : 0.8
Max : 1
0 : 330 (23.0%)
1 : 1106 (77.0%)
0
(0.0%)
36 Metallic_Rim
[integer]
Min : 0
Mean : 0.2
Max : 1
0 : 1142 (79.5%)
1 : 294 (20.5%)
0
(0.0%)
37 Radio_cassette
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1227 (85.4%)
1 : 209 (14.6%)
0
(0.0%)
38 Parking_Assistant
[integer]
Min : 0
Mean : 0
Max : 1
0 : 1432 (99.7%)
1 : 4 ( 0.3%)
0
(0.0%)
39 Tow_Bar
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1037 (72.2%)
1 : 399 (27.8%)
0
(0.0%)

Missing Variables Plot

# Missing Variables Plot for the Dataset
# naniar::gg_miss_var shows, per column, the share of missing entries
# (show_pct = TRUE displays percentages instead of counts).
gg_miss_var(ToyotaDT, show_pct = TRUE)

We can see that there are no missing values in our dataset ToyotaCorolla.csv.

Ex 6.4

Predicting Prices of Used Cars

Split the data into training (50%), validation (30%), and test (20%) datasets.

# Fix the RNG so the partition is reproducible
set.seed(1)

# Assign every row to one of three partitions:
# 1 = training (50%), 2 = validation (30%), 3 = test (20%)
splitting <- sample(1:3, size = nrow(ToyotaDT), replace = TRUE, prob = c(0.5, 0.3, 0.2))
Training   <- ToyotaDT[splitting == 1, ]
Validation <- ToyotaDT[splitting == 2, ]
Test       <- ToyotaDT[splitting == 3, ]

# Express each partition's share of the full dataset as a percentage
Prop_Training   <- 100 * nrow(Training) / nrow(ToyotaDT)
Prop_Validation <- 100 * nrow(Validation) / nrow(ToyotaDT)
Prop_Test       <- 100 * nrow(Test) / nrow(ToyotaDT)

# Report the realized proportions (they only approximate 50/30/20)
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")

[1] “The Proportions are: 52.58 % In Training, 27.79 % In Validation, and 19.64 % In Test”


Run a multiple linear regression

with the outcome variable Price and predictor variables Age_08, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.


Outcome Variable

Numerical: Price


Explanatory Variables

Numerical: Age_08, KM, HP, Doors, Quarterly_Tax, Guarantee_Period

Categorical/Dummy: Fuel Type, Automatic, Mfr_Guarantee, Airco, Automatic_airco, CD_Player, Powered Windows, Sport_Model, Tow_Bar


Running the Linear Regression

# Fit an OLS linear regression of Price on the fifteen requested predictors,
# using the training partition only.
Regression_Price <- lm(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = Training
)

# Suppress scientific notation in the printed coefficient table
options(scipen = 999)

summary(Regression_Price)
FALSE 
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + 
FALSE     Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + 
FALSE     Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model + 
FALSE     Tow_Bar, data = Training)
FALSE 
FALSE Residuals:
FALSE     Min      1Q  Median      3Q     Max 
FALSE -4605.0  -725.0    26.5   655.5  5567.0 
FALSE 
FALSE Coefficients:
FALSE                      Estimate   Std. Error t value             Pr(>|t|)    
FALSE (Intercept)      10149.303878   753.570928  13.468 < 0.0000000000000002 ***
FALSE Age_08_04         -106.254953     3.648627 -29.122 < 0.0000000000000002 ***
FALSE KM                  -0.018110     0.001693 -10.697 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel   1775.761977   509.701393   3.484             0.000523 ***
FALSE Fuel_TypePetrol   2087.180327   530.110397   3.937   0.0000902216317796 ***
FALSE HP                  26.355347     3.846774   6.851   0.0000000000154436 ***
FALSE Automatic          605.249517   178.506754   3.391             0.000734 ***
FALSE Doors              153.627129    46.146978   3.329             0.000915 ***
FALSE Quarterly_Tax       16.814925     2.081831   8.077   0.0000000000000027 ***
FALSE Mfr_Guarantee      125.684485    91.024994   1.381             0.167768    
FALSE Guarantee_Period    72.439768    15.661721   4.625   0.0000044171894508 ***
FALSE Airco              175.868563   106.848246   1.646             0.100197    
FALSE Automatic_airco   3178.052845   209.461203  15.173 < 0.0000000000000002 ***
FALSE CD_Player          247.920668   120.057343   2.065             0.039270 *  
FALSE Powered_Windows    426.529261   103.224416   4.132   0.0000400718888028 ***
FALSE Sport_Model        268.489425    98.070812   2.738             0.006336 ** 
FALSE Tow_Bar           -250.505084    97.621305  -2.566             0.010481 *  
FALSE ---
FALSE Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE 
FALSE Residual standard error: 1155 on 738 degrees of freedom
FALSE Multiple R-squared:  0.8915,  Adjusted R-squared:  0.8891 
FALSE F-statistic: 378.8 on 16 and 738 DF,  p-value: < 0.00000000000000022

Taking into account all the requested variables in our linear regression without further analysis or variable selection, we can observe a fairly high Multiple R-squared, close to 0.8915 and a fit of 0.8891. We have a very significant F-statistic, close to 0 which means that this model is already quite complex and better than a naive model including only the intercept. The most significant variables are the Age of the car, the number of kilometres (KM) and the automatic air conditioning (Automatic_airco), since all have the smallest p-value.


Show Models

# show models
# NOTE(review): `sum` is not created in any chunk visible here — presumably it
# is summary(regsubsets(...)) from a hidden leaps exhaustive-search chunk;
# confirm. The name also masks base::sum() for the rest of the document.
# $which lists, per model size, which predictors the best model includes.
sum$which
FALSE    (Intercept) Age_08_04    KM Fuel_TypeCNG Fuel_TypeDiesel Fuel_TypePetrol
FALSE 1         TRUE      TRUE FALSE        FALSE           FALSE           FALSE
FALSE 2         TRUE      TRUE FALSE        FALSE           FALSE           FALSE
FALSE 3         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 4         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 5         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 6         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 7         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 8         TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 9         TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 10        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 11        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 12        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 13        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 14        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 15        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 16        TRUE      TRUE  TRUE         TRUE            TRUE           FALSE
FALSE       HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco
FALSE 1  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 2  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 3  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 4  FALSE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 5   TRUE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 6   TRUE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 7   TRUE     FALSE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 8   TRUE     FALSE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 9   TRUE      TRUE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 10  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 11  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 12  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 13  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 14  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE  TRUE
FALSE 15  TRUE      TRUE  TRUE          TRUE          TRUE             TRUE  TRUE
FALSE 16  TRUE      TRUE  TRUE          TRUE          TRUE             TRUE  TRUE
FALSE    Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar
FALSE 1            FALSE     FALSE           FALSE       FALSE   FALSE
FALSE 2             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 3             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 4             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 5             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 6             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 7             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 8             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 9             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 10            TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 11            TRUE     FALSE            TRUE        TRUE   FALSE
FALSE 12            TRUE     FALSE            TRUE        TRUE    TRUE
FALSE 13            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 14            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 15            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 16            TRUE      TRUE            TRUE        TRUE    TRUE


R-Squared

# show metrics
# R-squared of the best model of each size. NOTE(review): `sum` is assumed to
# come from a hidden summary(regsubsets(...)) chunk — confirm.
sum$rsq
FALSE  [1] 0.7655313 0.8300988 0.8503445 0.8620935 0.8759748 0.8801229 0.8822341
FALSE  [8] 0.8844385 0.8861651 0.8874965 0.8888897 0.8897505 0.8905682 0.8909579
FALSE [15] 0.8913048 0.8914512

R-Squared keeps increasing as we add more and more variables to the model, which is expected since it does not account for the number of parameters (no penalty).


Adjusted R-Squared

sum$adjr2
FALSE  [1] 0.7652199 0.8296470 0.8497467 0.8613580 0.8751469 0.8791614 0.8811305
FALSE  [8] 0.8831993 0.8847899 0.8859844 0.8872447 0.8879675 0.8886484 0.8888949
FALSE [15] 0.8890985 0.8890979

Adjusted R-Squared keeps increasing until the 15th value, 0.8890985, which is slightly higher than the last one (16th).


Cp

sum$cp
FALSE  [1] 840.94284 404.55680 269.09689 191.32625  99.07820  72.91427  60.58056
FALSE  [8]  47.61333  37.89066  30.85073  23.39186  19.54705  15.99520  15.34962
FALSE [15]  14.99434  16.00000


The Cp value closest to the number of predictors + 1 is the last one (the 16th model), with Cp = 16, close to p + 1 = 16 + 1 = 17.

a. What appear to be the three or four most important car specifications for predicting the car’s price?

After the Exhaustive Search and popular subset-selection algorithms, we obtain these 3–4 most important variables for predicting a car's price: Age, KM, Automatic_airco and Quarterly_Tax. Depending on whether we decide to drop the variable Mfr_Guarantee, we might get slightly different prediction results, but the most important variables should stay the same. The following performance assessment keeps the same full model.

b. Using metrics you consider useful, assess the performance of the model in predicting prices.

Computing Prediction with the Regression Model on Validation - Accuracy

library(forecast)
# Score the fitted regression on the validation partition.
car.lm.pred <- predict(Regression_Price, Validation)
options(scipen = 999)
# Residual = actual - predicted, for the first 20 validation cars.
some.residuals <- Validation$Price[1:20] - car.lm.pred[1:20]
# Round explicitly instead of options(digits = 0): the `digits` option must be
# in 1..22, so setting it to 0 errors in current R.
data.frame("Predicted" = round(car.lm.pred[1:20]),
           "Actual"    = Validation$Price[1:20],
           "Residual"  = round(some.residuals))
FALSE    Predicted Actual Residual
FALSE 1      15327  13950    -1377
FALSE 2      14920  18600     3680
FALSE 3      17033  21500     4467
FALSE 4      20592  19600     -992
FALSE 5      20061  22500     2439
FALSE 6      19881  22750     2869
FALSE 7      14134  16950     2816
FALSE 8      17294  15950    -1344
FALSE 9      14919  15750      831
FALSE 10     14604  15750     1146
FALSE 11     14910  15950     1040
FALSE 12     14286  15750     1464
FALSE 13     14478  16750     2272
FALSE 14     14107  13950     -157
FALSE 15     15345  16950     1605
FALSE 16     16061  16950      889
FALSE 17     19033  19000      -33
FALSE 18     16568  17950     1382
FALSE 19     19668  21950     2282
FALSE 20     14663  15250      587
# Restore 3 significant digits for the accuracy table
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
# forecast::accuracy on a numeric prediction vector vs. the actual values
# reports ME, RMSE, MAE, MPE and MAPE for the validation set.
accuracy(car.lm.pred, Validation$Price)
FALSE          ME RMSE MAE    MPE MAPE
FALSE Test set 83 1319 948 -0.789 8.87

Here the resulting metrics for our predictions errors. The closest to 0 the better.

Histogram of Validation Errors

library(forecast)
# Recompute validation predictions and their residuals.
car.lm.pred <- predict(Regression_Price, Validation)
all.residuals <- Validation$Price - car.lm.pred
# Share of validation residuals within +/- 1406. Use the actual number of
# residuals as the denominator instead of the hard-coded 400, which does not
# equal nrow(Validation) for this split.
mean(all.residuals > -1406 & all.residuals < 1406)
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "", col = "#1c6155")

We can see the residuals of our predictions on the validation set. The spread is clearly visible, between -2000 and 2000, and approximately normally distributed. Some extreme residual values appear on the far right.

Ex 9.3

Predicting Prices of Used Cars (Regression Trees).

Data Preprocessing - 60% Training and 40% Validation

# Set Seed 
set.seed(1)

ToyotaDT_TREE <- fread("DATA/ToyotaCorolla.csv")

# Input Cutting Ratio
Prob_Train <- 0.6
Prob_Validation <- 1 - Prob_Train

# Draw one TRUE/FALSE per row; TRUE -> training, FALSE -> validation.
# Renamed from `sample` so the result no longer masks base::sample().
in_training <- sample(c(TRUE, FALSE), nrow(ToyotaDT_TREE), replace=TRUE, prob=c(Prob_Train,Prob_Validation))
Training_TREE  <- ToyotaDT_TREE[in_training, ]
Validation_TREE   <- ToyotaDT_TREE[!in_training, ]

# Proportions Check (realized split only approximates 60/40)
Prop_Train <- nrow(Training_TREE)/nrow(ToyotaDT_TREE)*100
Prop_Valid <- nrow(Validation_TREE)/nrow(ToyotaDT_TREE)*100

# Printing Proportions for double checking
print(paste(round(Prop_Train,2),"% In Training", round(Prop_Valid,2),"% In Validation"))
FALSE [1] "61.42 % In Training 38.58 % In Validation"

a. Run a Regression Tree

with CP=0.001 and “ANOVA” Method

# Set Seed
set.seed(1)

# Regression Tree Packages
library(rpart)
library(rpart.plot)

# As Factor Fuel_Type
#ToyotaDT_TREE$Fuel_Type <- as.factor(ToyotaDT_TREE$Fuel_Type)

# Regression Tree Parameters
cp_1 <- 0.001
method <- "anova"
minbucket <- 1
maxdepth <- 30 # 30 is rpart's maximum

# Running Regression Tree
# Fixes: `method` is an argument of rpart() itself, not rpart.control()
# (passed inside control it was silently ignored), and the requested
# predictor Automatic_airco replaces the accidentally duplicated Automatic.
Tree_1 <- rpart(Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
                  Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
                  Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
                  Tow_Bar,
                data = Training_TREE, method = method,
                control = rpart.control(maxdepth = maxdepth, cp = cp_1,
                                        minbucket = minbucket))

# Plotting Regression Tree (terminal nodes shaded)
Tree_1_Plot <- rpart.plot(Tree_1, type = 0, varlen = 0,
                          box.col = ifelse(Tree_1$frame$var == "<leaf>", '#8db0aa', 'white'),
                          fallen.leaves = FALSE, extra = FALSE)

# Number of Leafs (terminal nodes)
Length_Tree_1 <- length(Tree_1$frame$var[Tree_1$frame$var == "<leaf>"])

print(paste("There is", Length_Tree_1,"Number of Leaves"))
FALSE [1] "There is 31 Number of Leaves"

CP Plot

# Tree Size
# Cross-validated relative error vs. CP / number of splits (rpart::plotcp);
# used to eyeball a good pruning point.
plotcp(Tree_1)

With a small CP, our Regression Tree can grow up to 31 leaves, but we see that the relative error already seems small enough between 9 and 13 leaves.

i. Which appear to be the three or four most important car specifications for predicting the car’s price?

# Most 3 or 4 specifications for predictions in car's price
# Printing the rpart object lists every split with n, deviance and yval;
# the variables used in the top splits hint at the most influential predictors.
Tree_1
FALSE n= 882 
FALSE 
FALSE node), split, n, deviance, yval
FALSE       * denotes terminal node
FALSE 
FALSE   1) root 882 11000000000 10700  
FALSE     2) Age_08_04>=31.5 773  3070000000  9630  
FALSE       4) Age_08_04>=56.5 516   839000000  8670  
FALSE         8) Age_08_04>=68.5 248   283000000  7970  
FALSE          16) KM>=9.94e+04 71    91800000  7400  
FALSE            32) KM>=2e+05 5     3580000  5700 *
FALSE            33) KM< 2e+05 66    72600000  7530 *
FALSE          17) KM< 9.94e+04 177   160000000  8190 *
FALSE         9) Age_08_04< 68.5 268   316000000  9330  
FALSE          18) KM>=1.38e+05 11     9700000  7390 *
FALSE          19) KM< 1.38e+05 257   263000000  9410  
FALSE            38) Quarterly_Tax< 78.5 144   130000000  9070  
FALSE              76) KM>=4.23e+04 126   101000000  8940  
FALSE               152) Airco< 0.5 86    59000000  8730 *
FALSE               153) Airco>=0.5 40    30600000  9390 *
FALSE              77) KM< 4.23e+04 18    12700000  9950 *
FALSE            39) Quarterly_Tax>=78.5 113    94200000  9850  
FALSE              78) KM>=8.82e+04 34    17000000  9270 *
FALSE              79) KM< 8.82e+04 79    60600000 10100 *
FALSE       5) Age_08_04< 56.5 257   830000000 11500  
FALSE        10) KM>=1.28e+05 13    35200000  7040  
FALSE          20) Quarterly_Tax< 74.5 4     7930000  5320 *
FALSE          21) Quarterly_Tax>=74.5 9    10200000  7810 *
FALSE        11) KM< 1.28e+05 244   518000000 11800  
FALSE          22) Age_08_04>=44.5 119   210000000 11000  
FALSE            44) Airco< 0.5 57    56200000 10600  
FALSE              88) Mfr_Guarantee< 0.5 33    25500000 10200 *
FALSE              89) Mfr_Guarantee>=0.5 24    18300000 11100 *
FALSE            45) Airco>=0.5 62   132000000 11400  
FALSE              90) KM>=5.66e+04 41    35600000 11000 *
FALSE              91) KM< 5.66e+04 21    71700000 12300  
FALSE               182) Quarterly_Tax>=44 19    24300000 11900 *
FALSE               183) Quarterly_Tax< 44 2    18000000 16000 *
FALSE          23) Age_08_04< 44.5 125   172000000 12500  
FALSE            46) Powered_Windows< 0.5 49    58700000 12000 *
FALSE            47) Powered_Windows>=0.5 76    90100000 12800 *
FALSE     3) Age_08_04< 31.5 109   896000000 18200  
FALSE       6) HP< 113 100   506000000 17700  
FALSE        12) Age_08_04>=22.5 39   137000000 16400  
FALSE          24) HP< 104 17    30100000 14900  
FALSE            48) Sport_Model< 0.5 6      929000 13500 *
FALSE            49) Sport_Model>=0.5 11     9230000 15700 *
FALSE          25) HP>=104 22    40000000 17600 *
FALSE        13) Age_08_04< 22.5 61   261000000 18600  
FALSE          26) HP< 104 30   120000000 17600  
FALSE            52) Quarterly_Tax< 222 24    26000000 17000  
FALSE             104) Age_08_04>=14 19    11500000 16600 *
FALSE             105) Age_08_04< 14 5      740000 18500 *
FALSE            53) Quarterly_Tax>=222 6    43700000 20200  
FALSE             106) KM>=4.43e+04 4     5370000 18500 *
FALSE             107) KM< 4.43e+04 2     3750000 23600 *
FALSE          27) HP>=104 31    91800000 19500  
FALSE            54) Quarterly_Tax< 44 5     6780000 17000 *
FALSE            55) Quarterly_Tax>=44 26    48000000 19900  
FALSE             110) Airco< 0.5 1           0 16400 *
FALSE             111) Airco>=0.5 25    34700000 20100 *
FALSE       7) HP>=113 9   113000000 23500  
FALSE        14) Quarterly_Tax< 258 8    22600000 22400  
FALSE          28) Age_08_04>=15 5     2890000 21300 *
FALSE          29) Age_08_04< 15 3     2670000 24300 *
FALSE        15) Quarterly_Tax>=258 1           0 32500 *
# rpart's variable.importance ranks predictors by their total contribution
# to the splits (including surrogate splits).
Tree_1_Importance <- as.data.frame(Tree_1$variable.importance)

DT::datatable(Tree_1_Importance)

Using our Regression Tree output, we can see that the first few splits are based on Age only. The 4 most important variables are Age_08_04, KM, Quarterly_Tax and HP.

ii. RMS Error between Training and Validation

# Set Seed 
set.seed(1)

# Predictions of Training and Validation
Training_Predictions <- predict(Tree_1,Training_TREE)
Validation_Predictions <- predict(Tree_1, Validation_TREE)

# RMSE  ------------------------------------------------------------------------

# Training RMSE (caret::RMSE)
RMSE_Training <- RMSE(Training_Predictions,Training_TREE$Price)

# Validation RMSE 
RMSE_Validation <- RMSE(Validation_Predictions, Validation_TREE$Price)

# Collect both RMSE values in a one-row data frame for display
RMSE_Dataframe <- data.frame(RMSE_Training = RMSE_Training,
                             RMSE_Validation = RMSE_Validation)

# The flextable was previously passed RMSE_Dataframe a second time through
# `...`, which set_header_labels() does not use; only `values` is needed.
flextable(RMSE_Dataframe) %>%
  set_header_labels(values = list(RMSE_Training = "RMSE Training",
                                  RMSE_Validation = "RMSE Validation"))

Computing Residuals Boxplots

# Set Seed 
set.seed(1)

# Computing Residuals --------------------------------------------------------------
# Residual = actual price - tree prediction, for each partition.
Residuals_Training_Tree_1 <- Training_TREE$Price - Training_Predictions
Residuals_Validation_Tree_1 <- Validation_TREE$Price - Validation_Predictions

# as.data.frame so ggplot2 can map the residual vector as a column
Residuals_Training_Tree_1 <- as.data.frame(Residuals_Training_Tree_1)
Residuals_Validation_Tree_1 <- as.data.frame(Residuals_Validation_Tree_1)

# Boxplots -------------------------------------------------------------------------

library(ggplot2)

# Boxplot Training (wrapped in ggplotly for interactivity; the shared
# +/- 10000 y-limit keeps the two plots directly comparable)
box1 <- ggplotly(ggplot(Residuals_Training_Tree_1) +
 aes(x = "", y = Residuals_Training_Tree_1) +
 geom_boxplot(fill = "#1c6155") +
 labs(x = "with 882 observations", y = "Price Residuals", title = "Residuals Training VS Predicted", subtitle = "Boxplot", 
 caption = "") +
 theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))

# Boxplot Validation
box2 <- ggplotly(ggplot(Residuals_Validation_Tree_1) +
 aes(x = "", y = Residuals_Validation_Tree_1) +
 geom_boxplot(fill = "#1c6155") +
 labs(x = "with 554 observations", y = "Price Residuals", title = "Residuals Validation VS Predicted", subtitle = "Boxplot", 
 caption = "") +
 theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))

# Combine them side by side (manipulateWidget::combineWidgets)
combineWidgets(box1,box2, ncol=2)

iii. How can we achieve predictions for the training set that are not equal to the actual prices?

With a fully grown tree (minbucket = 1, cp close to 0), many leaves contain a single training record, so training predictions can equal the actual prices. To obtain training predictions that are not equal to the actual prices, we must limit the tree's growth — e.g. increase minbucket/minsplit, lower maxdepth, or raise cp — so that each leaf contains several records and the prediction is the leaf average rather than an individual price.

iv. Prune the full tree using the cross-validation error. Compared to the full tree, what is the predictive performance for the validation set?

CP Table of our full Tree Regression (CP=0.001)

# Set Seed 
set.seed(1)

cp_2 <- 0.001

# Refit the full tree. Fixes: `method` belongs to rpart(), not rpart.control()
# (inside control it was silently ignored), and Automatic_airco replaces the
# accidentally duplicated Automatic term so the predictor set matches the
# exercise's list.
Tree_Full <- rpart(Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
                     Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
                     Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
                     Tow_Bar,
                   data = Training_TREE, method = method,
                   control = rpart.control(maxdepth = maxdepth, cp = cp_2,
                                           minbucket = minbucket))

# CP table: one row per candidate pruning, with cross-validated xerror
DT::datatable(Tree_Full$cptable)

Prune Tree with optimal

# Select the CP with the smallest cross-validated error (xerror) from the
# full tree's CP table. `best_CP` was used below but never defined anywhere
# in the document.
best_CP <- Tree_Full$cptable[which.min(Tree_Full$cptable[, "xerror"]), "CP"]

# Prune the full tree back to the optimal CP
Pruned_Tree <- prune(Tree_Full, cp = best_CP)

# Plotting Pruned Tree
Pruned_Tree_Plot <- rpart.plot(Pruned_Tree, type = 0, varlen = 0,
                               box.col = ifelse(Pruned_Tree$frame$var == "<leaf>", '#8db0aa', 'white'),
                               fallen.leaves = FALSE, extra = FALSE)

# Number of Leafs (terminal nodes)
Length_Tree_Pruned <- length(Pruned_Tree$frame$var[Pruned_Tree$frame$var == "<leaf>"])

print(paste("There is", Length_Tree_Pruned,"Number of Leaves"))
FALSE [1] "There is 28 Number of Leaves"

Predictions Accuracy on Validation Set with Prune Tree

# Predicting on the validation set with both trees
Predictions_Full_Tree <- predict(Tree_Full, Validation_TREE)
Predictions_Prune_Tree <- predict(Pruned_Tree, Validation_TREE)

# RMSE of Full and Prune Tree (caret::RMSE)
RMSE_Full_Tree <- RMSE(Validation_TREE$Price, Predictions_Full_Tree)
RMSE_Prune_Tree <- RMSE(Validation_TREE$Price, Predictions_Prune_Tree)

# Data frame RMSE — previously RMSE_Full_Tree was computed but left out of
# the displayed table; include all three columns. The full-tree label also
# said "CP=0" although cp_2 = 0.001 was used.
RMSE_Full_Tree_2 <- data.frame(RMSE_Full_Tree, RMSE_Prune_Tree, RMSE_Validation)

# Printing Table
flextable(RMSE_Full_Tree_2) %>%
  set_header_labels(values = list(
    RMSE_Full_Tree = "RMSE Validation - Full Tree with CP=0.001",
    RMSE_Prune_Tree = "RMSE Validation - Prune Tree with CP=0.00121",
    RMSE_Validation = "RMSE Validation - First Tree with CP=0.001"))

We can see that pruning our Regression Tree gave us a smaller RMSE compared to the first tree. By default we used CP = 0.001 when training both Regression Trees, but by choosing the CP with the smallest cross-validation error (x-error), the pruned Regression Tree ended up with CP = 0.00121.

b. Price Variable into Categorical Variable

Here are the Binned_Price Variable into 20 levels

# Duplicating Training Data 
New_Training_Tree <- Training_TREE

# Bin the continuous Price into 20 equal-width intervals (factor levels)
New_Training_Tree$Price <- cut(New_Training_Tree$Price, breaks = 20)

# Rename by name rather than by hard-coded column position 3, which would
# silently rename the wrong column if the column order ever changed.
names(New_Training_Tree)[names(New_Training_Tree) == "Price"] <- "Binned_Price"

# Printing Levels of Binned_Price
print(paste(levels(New_Training_Tree$Binned_Price)))
FALSE  [1] "(4.32e+03,5.76e+03]" "(5.76e+03,7.16e+03]" "(7.16e+03,8.57e+03]"
FALSE  [4] "(8.57e+03,9.98e+03]" "(9.98e+03,1.14e+04]" "(1.14e+04,1.28e+04]"
FALSE  [7] "(1.28e+04,1.42e+04]" "(1.42e+04,1.56e+04]" "(1.56e+04,1.7e+04]" 
FALSE [10] "(1.7e+04,1.84e+04]"  "(1.84e+04,1.98e+04]" "(1.98e+04,2.12e+04]"
FALSE [13] "(2.12e+04,2.26e+04]" "(2.26e+04,2.41e+04]" "(2.41e+04,2.55e+04]"
FALSE [16] "(2.55e+04,2.69e+04]" "(2.69e+04,2.83e+04]" "(2.83e+04,2.97e+04]"
FALSE [19] "(2.97e+04,3.11e+04]" "(3.11e+04,3.25e+04]"

i. Compare CT and RT. Are they different? (Structure, Top Predictors, Size of tree, etc.) Why?

Running Class Tree

# Running Class Tree on the binned price. Automatic_airco replaces the
# accidentally duplicated Automatic term so the predictor set matches the
# earlier regression trees.
CT_Tree <- rpart(Binned_Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
                   Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period +
                   Airco + Automatic_airco + CD_Player + Powered_Windows +
                   Sport_Model + Tow_Bar,
                 data = New_Training_Tree, method = "class")

# Plotting Tree (terminal nodes shaded)
CT_Tree_Plot <- rpart.plot(CT_Tree, type = 0, varlen = 0,
                           box.col = ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'),
                           fallen.leaves = FALSE, extra = FALSE)

# Number of Leafs (terminal nodes)
Length_Tree_CT <- length(CT_Tree$frame$var[CT_Tree$frame$var == "<leaf>"])
print(paste("There is", Length_Tree_CT,"Number of Leaves"))
FALSE [1] "There is 8 Number of Leaves"

We can see that this Classification Tree has 8 terminal nodes (or leaves) and is much shallower than the Regression Tree used in the previous exercises. There are up to 4 layers in this CT.

Importance of Variables in CT

# Most 3 or 4 specifications for predictions in car's price
# rpart's variable.importance ranks predictors by their total split
# contribution (including surrogate splits).
CT_Tree_Importance <- as.data.frame(CT_Tree$variable.importance)

DT::datatable(CT_Tree_Importance)

Using our Class Tree output, we can see that the 4 most important variables are Age_08_04, KM, CD_Player and Airco

ii. Predict the price, using the RT and the CT, of a used Toyota Corolla with the following specifications.

# One-row data frame with the exercise's car specification. The original had
# "Automatic" twice (data.frame silently renames the second to Automatic.1)
# and omitted Automatic_airco; the second entry is now Automatic_airco = 0.
Prediction_1_Car <- data.frame(
  "Age_08_04" = 77, "KM" = 117000, "Fuel_Type" = "Petrol", "HP" = 110,
  "Automatic" = 0, "Doors" = 5, "Quarterly_Tax" = 100, "Mfr_Guarantee" = 0,
  "Guarantee_Period" = 3, "Airco" = 1, "Automatic_airco" = 0, "CD_Player" = 0,
  "Powered_Windows" = 0, "Sport_Model" = 0, "Tow_Bar" = 1)

# Predicting Car Price with CT: one probability per price bin
Predicted_Car_Price <- predict(CT_Tree, Prediction_1_Car)

# Keep only the bins with non-zero probability
Predicted_Car_Price <- Predicted_Car_Price[, colSums(Predicted_Car_Price != 0) > 0]

# As Dataframe for display
Predicted_Car_Price <- as.data.frame(Predicted_Car_Price)

# Renaming Column
colnames(Predicted_Car_Price) <- "Probabilities in %"

# Convert to percentages, rounded to 2 decimals
Predicted_Car_Price$`Probabilities in %` <- round(Predicted_Car_Price$`Probabilities in %` * 100, 2)

# Interactive table of the surviving bins
DT::datatable(Predicted_Car_Price)

iii. Compare the predictions in terms of the predictors that were used, the magnitude of the difference between the two predictions, and the advantages and disadvantages of the two methods.

CT_Tree_Plot <- rpart.plot(CT_Tree, type=0, varlen = 0, box.col=ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE)

By using the CT, we first start with Age_08_04 >= 57 -> Yes, then Age_08_04 >= 70 -> Yes, then KM >= 174000 -> No, and we stop at (7.16e+03,8.57e+03], which has a 53.57% probability for this specific car. We use far fewer steps to reach our prediction compared to the Regression Tree. By using a CT instead of an RT, we do not get an exact price value, but instead a probability distribution over the different price bins our car could fall into; selecting the highest probability assigns our car's price to a category. The CT has less precision in its outcome predictions, but it has far fewer branches and terminal nodes, making it easier to read and to understand each variable's importance in the prediction process, while also providing probabilities, which can be useful depending on the context.

Ex 11.3

Car Sales

The Goal is to predict the price of a used Toyota Corolla based on its specifications.

a. Fit a neural network model to the data. Use a single hidden layer with 2 nodes.

Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.

Splitting Training and Validation Set - 60% VS 40%

# Reproducible Results
set.seed(1)

# Required packages
library(neuralnet)
library(nnet)
library(caret)

# Load the Dataset with fread()
ToyotaDT_NN <- fread("DATA/ToyotaCorolla.csv")

# Select the target (Price) plus the 15 predictor variables
ToyotaDT_NN <- ToyotaDT_NN[,c("Price","Age_08_04","KM","Fuel_Type","HP","Automatic","Doors","Quarterly_Tax","Mfr_Guarantee","Guarantee_Period","Airco","Automatic_airco","CD_Player","Powered_Windows","Sport_Model","Tow_Bar")]

# Input Cutting Ratio
probability_train <- 0.6
probability_test <- 1 - probability_train

# Training and Validation Set Splitting (random 60/40 row mask)
sample <- sample(c(TRUE, FALSE), nrow(ToyotaDT_NN), replace=TRUE, prob=c(probability_train,probability_test))
Train_NN  <- ToyotaDT_NN[sample, ]
Test_NN   <- ToyotaDT_NN[!sample, ]

# Proportions Check
Prop_Training <- nrow(Train_NN)/nrow(ToyotaDT_NN)*100
Prop_Validation <- nrow(Test_NN)/nrow(ToyotaDT_NN)*100

# Preprocess Variables - Fuel_Type to dummies, numeric columns rescaled

## Create dummies for fuel type, SEPARATELY for each set.
## BUG FIX: the original built the dummies from Train_NN only and then
## cbind()-ed those training-set rows onto Test_NN, misaligning (and
## recycling) rows. Each set must be encoded from its own Fuel_Type column.
## NOTE(review): if a fuel type is absent from one split, its dummy column
## will be missing there — confirm all levels appear in both splits.
Fuel_Type_Train <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data=Train_NN))
Fuel_Type_Test  <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data=Test_NN))

## Replace Fuel_Type column with its dummies in the training set
Train_NN <- cbind(Train_NN[,-c("Fuel_Type")], Fuel_Type_Train[,])

## Replace Fuel_Type column with its dummies in the test set
Test_NN <- cbind(Test_NN[,-c("Fuel_Type")], Fuel_Type_Test[,])

## Fit the [0,1] range scaling on the TRAINING set only (avoids leakage)
preProcValues <- preProcess(Train_NN[,c("Price","Age_08_04","KM","HP","Doors","Quarterly_Tax","Guarantee_Period")], method = "range")

## Preprocess the Training Set
Train_NN_Preprocess <- predict(preProcValues, Train_NN)

## Preprocess the Test Set (with the training-set scaling)
Test_NN_Preprocess <- predict(preProcValues, Test_NN)

# Print Proportions

print(paste(round(Prop_Training,2),"% In Training", round(Prop_Validation,2),"% In Validation"))
FALSE [1] "61.42 % In Training 38.58 % In Validation"

Neural Network A - 1 Hidden Layers with 2 Nodes

# Reproducible Results
set.seed(1)

# Network A: one hidden layer with 2 nodes; linear output for regression.
# FIX: use TRUE rather than the reassignable shorthand T.
nn_1 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = 2)

# Plot the best repetition of the fitted network
plot(nn_1, rep="best")

Weights of the Neural Network A

# Display the fitted weight matrices (input->hidden, then hidden->output)
nn_1$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]   [,2]
FALSE  [1,] -0.4309  2.209
FALSE  [2,]  2.0122  3.652
FALSE  [3,]  0.8866 -1.037
FALSE  [4,] -0.7398 -1.433
FALSE  [5,] -0.1306  0.255
FALSE  [6,] -0.1019 -0.201
FALSE  [7,] -0.6180  1.546
FALSE  [8,] -0.0805 -0.176
FALSE  [9,]  0.1344 16.006
FALSE [10,] -0.0968 -0.293
FALSE [11,] -0.2474 35.270
FALSE [12,]  0.0789  1.118
FALSE [13,] -0.1134 -0.299
FALSE [14,]  0.0771  1.787
FALSE [15,]  0.1233  1.682
FALSE [16,]  0.7030 -1.676
FALSE [17,]  0.5789 -1.406
FALSE [18,]  0.7630  0.267
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]
FALSE [1,] -0.365
FALSE [2,] -1.028
FALSE [3,]  1.419

Neural Network B - 1 Hidden Layers with 5 Nodes

# Reproducible Results
set.seed(1)

# Network B: one hidden layer with 5 nodes; linear output for regression.
# FIX: use TRUE rather than the reassignable shorthand T.
nn_2 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = 5)

# Plot the best repetition of the fitted network
plot(nn_2, rep="best")

Weights of the Neural Network B

# Display the fitted weight matrices (input->hidden, then hidden->output)
nn_2$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]     [,2]    [,3]   [,4]     [,5]
FALSE  [1,] -1.0822   0.2664 -0.7609  1.869  0.39221
FALSE  [2,] -1.6464  -0.6606 -1.4234  1.682 -1.92659
FALSE  [3,] -0.3819   0.6737 -0.8228 -0.520 -1.17504
FALSE  [4,]  1.1282  -0.0761  0.9746 -0.426  0.19693
FALSE  [5,] -0.8555  -5.9541  0.8139  2.845  0.15236
FALSE  [6,] -0.2748  -0.8334  0.4679  0.254  0.16805
FALSE  [7,]  0.0783  -4.9264  1.6873  2.500 -0.09956
FALSE  [8,]  0.4218   0.6210  0.2834 -0.246 -0.39992
FALSE  [9,]  1.3905 -35.2628 -1.0621 18.735 -0.72520
FALSE [10,]  0.0566   0.4204  0.0491 -0.165  0.08564
FALSE [11,]  1.2989   4.3655 -0.5550 -0.871  0.03334
FALSE [12,] -0.2900  -1.4439  1.0142  1.175 -0.82484
FALSE [13,] -0.1653   1.9985  0.1753 -1.415  0.40796
FALSE [14,] -0.3060 -91.3117  0.2948  2.309 -0.00232
FALSE [15,]  0.2535 -78.1617 -0.0750  2.528 -0.44375
FALSE [16,]  1.1113   1.3446 -1.7674 -1.909 -1.48382
FALSE [17,]  0.5031  -1.6160 -1.2026 -0.971 -0.16114
FALSE [18,]  0.4911  -1.0049 -1.5398 -0.259 -0.24136
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]
FALSE [1,] -0.743
FALSE [2,]  0.439
FALSE [3,]  0.201
FALSE [4,]  0.414
FALSE [5,]  0.743
FALSE [6,]  0.496

Neural Network C - 2 Hidden Layers with 5 Nodes in each

# Reproducible Results
set.seed(1)

# Network C: two hidden layers with 5 nodes each; linear output for regression.
# FIX: use TRUE rather than the reassignable shorthand T.
nn_3 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = c(5,5))

# Plot the best repetition of the fitted network
plot(nn_3, rep="best")

Weights of the Neural Network C

# display weights
nn_3$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]    [,2]   [,3]   [,4]     [,5]
FALSE  [1,]  1.0014  0.8207 -0.263 -1.308    1.154
FALSE  [2,]  2.6944 -3.8210  2.189 10.242   -2.117
FALSE  [3,]  1.9919 -1.8167  0.679 -1.140    0.783
FALSE  [4,]  2.6962  1.9304 -0.935 -5.662    9.611
FALSE  [5,] -2.0652 -0.1293 -0.492  1.182    0.806
FALSE  [6,]  0.0779 -0.6930 -1.155 -0.298   -0.300
FALSE  [7,] -5.3986  3.0127  1.366  4.488   12.672
FALSE  [8,]  2.3418  0.8618  0.378 -0.762    0.396
FALSE  [9,] -3.7351 -1.2264 -1.887 33.366   -3.809
FALSE [10,] -1.1707 -0.2756 -0.432 -1.375   -2.054
FALSE [11,] -5.5473 -0.0329 -0.332  4.946   -2.549
FALSE [12,]  1.0162  0.2500  0.318  3.204    6.706
FALSE [13,] -0.8608  0.4563  0.167  0.141   19.021
FALSE [14,] -4.3215 -0.0348  0.619  1.386   -1.194
FALSE [15,]  0.3951  0.1364  0.283  0.611    4.325
FALSE [16,] -2.9950 -3.3677 -0.303 -3.822 -341.739
FALSE [17,]  4.2604 -3.1110 -2.016 -5.497  -36.463
FALSE [18,]  0.9853 -0.4030 -0.868  0.543    0.255
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]   [,2]   [,3]   [,4]   [,5]
FALSE [1,] -0.495 -1.264 -0.924  -3.26 -0.322
FALSE [2,]  1.039 -1.681  0.586  -2.20 -0.930
FALSE [3,] 36.661 -2.006 -0.832 -11.08  0.957
FALSE [4,]  0.554  0.297  2.981  48.03 -0.701
FALSE [5,]  0.937 -0.477  0.413   8.22  0.725
FALSE [6,]  0.718  2.243  1.117   1.29 -0.291
FALSE 
FALSE [[1]][[3]]
FALSE        [,1]
FALSE [1,] -0.624
FALSE [2,]  1.224
FALSE [3,] -0.286
FALSE [4,] -0.324
FALSE [5,] -0.247
FALSE [6,]  0.745

Predictions and RMSE

RMSE For Training Set

# Training-set predictions and RMSE ---------------------------

# Predictions for Training -----------------------------------

# Predictions with nn_1
Train_Prediction_nn_1 <- predict(nn_1,Train_NN_Preprocess)

# Predictions with nn_2
Train_Prediction_nn_2 <- predict(nn_2,Train_NN_Preprocess)

# Predictions with nn_3
Train_Prediction_nn_3 <- predict(nn_3,Train_NN_Preprocess)

# Back transform to Original Scale ---------------------------
# Inverse of the "range" preprocessing, using the training-set Price extremes

Train_Prediction_nn_1 <- Train_Prediction_nn_1*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
Train_Prediction_nn_2 <- Train_Prediction_nn_2*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
Train_Prediction_nn_3 <- Train_Prediction_nn_3*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)

# RMSE for Train  -------------------------------------------------------

RMSE_Train_Prediction_nn_1 <- RMSE(Train_Prediction_nn_1,Train_NN$Price)
RMSE_Train_Prediction_nn_2 <- RMSE(Train_Prediction_nn_2,Train_NN$Price)
RMSE_Train_Prediction_nn_3 <- RMSE(Train_Prediction_nn_3,Train_NN$Price)

# BUG FIX: the original stored this vector in `RMSE`, shadowing caret::RMSE().
# A distinct name keeps the function untouched for later chunks.
RMSE_Train <- c(RMSE_Train_Prediction_nn_1,RMSE_Train_Prediction_nn_2,RMSE_Train_Prediction_nn_3)

# Rounding RMSE to 2 decimals
RMSE_Train <- round(RMSE_Train,2)

# Adding Name to Model
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes","Neural B: 1 Hidden Layer, 5 Nodes", "Neural C: 2 Hidden Layer, 5 Nodes")

# Keep the displayed column header "RMSE" as before
Frame_RMSE <- cbind(Model, RMSE = RMSE_Train)
Frame_RMSE <- as.data.frame(Frame_RMSE)

RMSE_DATA <- flextable(Frame_RMSE) %>% fontsize(size = 8, part = "all")

(RMSE_DATA)

RMSE For Validation Set

# Validation-set predictions and RMSE -------------------------

# Score each fitted network on the preprocessed validation set
Validation_Prediction_nn_1 <- predict(nn_1, Test_NN_Preprocess)
Validation_Prediction_nn_2 <- predict(nn_2, Test_NN_Preprocess)
Validation_Prediction_nn_3 <- predict(nn_3, Test_NN_Preprocess)

# Undo the [0, 1] range scaling using the training-set Price extremes
Validation_Prediction_nn_1 <- Validation_Prediction_nn_1 * (max(Train_NN$Price) - min(Train_NN$Price)) + min(Train_NN$Price)
Validation_Prediction_nn_2 <- Validation_Prediction_nn_2 * (max(Train_NN$Price) - min(Train_NN$Price)) + min(Train_NN$Price)
Validation_Prediction_nn_3 <- Validation_Prediction_nn_3 * (max(Train_NN$Price) - min(Train_NN$Price)) + min(Train_NN$Price)

# RMSE of each model against the raw validation prices
RMSE_Validation_Prediction_1 <- RMSE(Validation_Prediction_nn_1, Test_NN$Price)
RMSE_Validation_Prediction_2 <- RMSE(Validation_Prediction_nn_2, Test_NN$Price)
RMSE_Validation_Prediction_3 <- RMSE(Validation_Prediction_nn_3, Test_NN$Price)

# Collect the three scores and round to 2 decimals
RMSE_Validation <- round(c(RMSE_Validation_Prediction_1,
                           RMSE_Validation_Prediction_2,
                           RMSE_Validation_Prediction_3), 2)

# Human-readable model labels
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes","Neural B: 1 Hidden Layer, 5 Nodes", "Neural C: 2 Hidden Layer, 5 Nodes")

# Assemble the display table; the symbol names become the column names,
# which set_header_labels() below relies on
Frame_RMSE_Validation <- as.data.frame(cbind(Model, RMSE_Validation))

# Render with flextable, relabelling the RMSE column for display
RMSE_VALIDATION_DATA <- flextable(Frame_RMSE_Validation) %>% fontsize(size = 8, part = "all")
RMSE_VALIDATION_DATA <- set_header_labels(RMSE_VALIDATION_DATA, RMSE_Validation = "RMSE")

(RMSE_VALIDATION_DATA)

i. What happens to the RMS error for the training data as the number of layers and nodes increases ?

The training RMSE tends to decrease as we make our neural network more complex (Model C in this case). This indicates that we are increasingly overfitting our model to the training dataset.

ii. What happens to the RMS error for the validation data?

The RMSE for the validation data is lowest for Neural Model B, and then it increases with Model C.

iii. Comment on the appropriate number of layers and nodes for this application.

Models A and B are close, but Model A is not in an overfitting situation. We should decide whether we prefer the better predictions of Model B, with more overfitting to the training dataset, or Model A, which gives less accurate results but more stability in future predictions since it does not overfit the way Model C does. In this case, I would prefer Model B, which gives more accurate results even though it is more prone to instability — thus choosing 1 hidden layer with 5 nodes.